In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl 


from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import plotly.express as px
import plotly.graph_objects as go
In [2]:
%matplotlib inline
In [3]:
# Apply matplotlib's built-in "ggplot" style sheet to all matplotlib figures.
plt.style.use("ggplot")
In [4]:
# Load the bacteria table. The path is relative, so it resolves against the
# notebook's working directory.
# NOTE(review): the numeric columns look like dose/concentration values where
# smaller is better (a later cell picks the row-wise minimum as "best") — confirm
# against the data source.
df = pd.read_csv("../HW2/Bacteria2.csv")
In [5]:
df
Out[5]:
Bacteria Penicilin Streptomycin Neomycin Gram
0 Aerobacter aerogenes 870.000 1.00 1.600 negative
1 Brucella abortus 1.000 2.00 0.020 negative
2 Brucella anthracis 0.001 0.01 0.007 positive
3 Diplococcus pneumoniae 0.005 11.00 10.000 positive
4 Escherichia coli 100.000 0.40 0.100 negative
5 Klebsiella pneumoniae 850.000 1.20 1.000 negative
6 Mycobacterium tuberculosis 800.000 5.00 2.000 negative
7 Proteus vulgaris 3.000 0.10 0.100 negative
8 Pseudomonas aeruginosa 850.000 2.00 0.400 negative
9 Salmonella (Eberthella) typhosa 1.000 0.40 0.008 negative
10 Salmonella schottmuelleri 10.000 0.80 0.090 negative
11 Staphylococcus albus 0.007 0.10 0.001 positive
12 Staphylococcus aureus 0.030 0.03 0.001 positive
13 Streptococcus fecalis 1.000 1.00 0.100 positive
14 Streptococcus hemolyticus 0.001 14.00 10.000 positive
15 Streptococcus viridans 0.005 10.00 40.000 positive
In [6]:
df.columns
Out[6]:
Index(['Bacteria', 'Penicilin', 'Streptomycin', 'Neomycin', 'Gram'], dtype='object')
In [7]:
df.Penicilin
Out[7]:
0     870.000
1       1.000
2       0.001
3       0.005
4     100.000
5     850.000
6     800.000
7       3.000
8     850.000
9       1.000
10     10.000
11      0.007
12      0.030
13      1.000
14      0.001
15      0.005
Name: Penicilin, dtype: float64
In [8]:
# Add a base-10 log version of each antibiotic column, named with an "L" suffix
# (PenicilinL, StreptomycinL, NeomycinL), in that order.
for antibiotic in ("Penicilin", "Streptomycin", "Neomycin"):
    df[antibiotic + "L"] = np.log10(df[antibiotic])
In [9]:
df
Out[9]:
Bacteria Penicilin Streptomycin Neomycin Gram PenicilinL StreptomycinL NeomycinL
0 Aerobacter aerogenes 870.000 1.00 1.600 negative 2.939519 0.000000 0.204120
1 Brucella abortus 1.000 2.00 0.020 negative 0.000000 0.301030 -1.698970
2 Brucella anthracis 0.001 0.01 0.007 positive -3.000000 -2.000000 -2.154902
3 Diplococcus pneumoniae 0.005 11.00 10.000 positive -2.301030 1.041393 1.000000
4 Escherichia coli 100.000 0.40 0.100 negative 2.000000 -0.397940 -1.000000
5 Klebsiella pneumoniae 850.000 1.20 1.000 negative 2.929419 0.079181 0.000000
6 Mycobacterium tuberculosis 800.000 5.00 2.000 negative 2.903090 0.698970 0.301030
7 Proteus vulgaris 3.000 0.10 0.100 negative 0.477121 -1.000000 -1.000000
8 Pseudomonas aeruginosa 850.000 2.00 0.400 negative 2.929419 0.301030 -0.397940
9 Salmonella (Eberthella) typhosa 1.000 0.40 0.008 negative 0.000000 -0.397940 -2.096910
10 Salmonella schottmuelleri 10.000 0.80 0.090 negative 1.000000 -0.096910 -1.045757
11 Staphylococcus albus 0.007 0.10 0.001 positive -2.154902 -1.000000 -3.000000
12 Staphylococcus aureus 0.030 0.03 0.001 positive -1.522879 -1.522879 -3.000000
13 Streptococcus fecalis 1.000 1.00 0.100 positive 0.000000 0.000000 -1.000000
14 Streptococcus hemolyticus 0.001 14.00 10.000 positive -3.000000 1.146128 1.000000
15 Streptococcus viridans 0.005 10.00 40.000 positive -2.301030 1.000000 1.602060
In [10]:
from sklearn.manifold import MDS
In [11]:
# Keep only the three log10-transformed antibiotic columns.
logdf = df.loc[:, ["PenicilinL", "StreptomycinL", "NeomycinL"]]
In [12]:
logdf
Out[12]:
PenicilinL StreptomycinL NeomycinL
0 2.939519 0.000000 0.204120
1 0.000000 0.301030 -1.698970
2 -3.000000 -2.000000 -2.154902
3 -2.301030 1.041393 1.000000
4 2.000000 -0.397940 -1.000000
5 2.929419 0.079181 0.000000
6 2.903090 0.698970 0.301030
7 0.477121 -1.000000 -1.000000
8 2.929419 0.301030 -0.397940
9 0.000000 -0.397940 -2.096910
10 1.000000 -0.096910 -1.045757
11 -2.154902 -1.000000 -3.000000
12 -1.522879 -1.522879 -3.000000
13 0.000000 0.000000 -1.000000
14 -3.000000 1.146128 1.000000
15 -2.301030 1.000000 1.602060
In [13]:
# Embed the three log-transformed columns into two dimensions with MDS.
# MDS starts from a random initial configuration, so without a fixed seed the
# coordinates (and every plot built from them) change on each run. Pinning
# random_state makes Restart & Run All reproducible.
# Note: coordinates recorded from earlier, unseeded runs will not match.
df_MDS = MDS(n_components=2, random_state=0).fit_transform(logdf)
In [14]:
df_MDS
Out[14]:
array([[-2.50140065, -1.77138718],
       [ 0.01711131,  0.57776751],
       [ 2.3777941 ,  3.08819238],
       [ 2.89605844, -1.29014142],
       [-1.97173541, -0.28504887],
       [-2.51786281, -1.61837219],
       [-2.26187438, -2.19891479],
       [-0.82293742,  0.60928886],
       [-2.63552152, -1.31055685],
       [-0.31132106,  1.2724154 ],
       [-0.93461712, -0.15453806],
       [ 1.34423962,  2.93766582],
       [ 0.65608948,  3.01588131],
       [ 0.04672363,  0.07080214],
       [ 3.5832936 , -1.12704749],
       [ 3.03596019, -1.81600658]])
In [15]:
df_MDS.shape
Out[15]:
(16, 2)
In [16]:
# df_MDS has one row per bacterium and two columns; transposing it yields the
# two coordinate vectors, which are stored as MDS_x and MDS_y.
df["MDS_x"], df["MDS_y"] = df_MDS.T
In [17]:
df
Out[17]:
Bacteria Penicilin Streptomycin Neomycin Gram PenicilinL StreptomycinL NeomycinL MDS_x MDS_y
0 Aerobacter aerogenes 870.000 1.00 1.600 negative 2.939519 0.000000 0.204120 -2.501401 -1.771387
1 Brucella abortus 1.000 2.00 0.020 negative 0.000000 0.301030 -1.698970 0.017111 0.577768
2 Brucella anthracis 0.001 0.01 0.007 positive -3.000000 -2.000000 -2.154902 2.377794 3.088192
3 Diplococcus pneumoniae 0.005 11.00 10.000 positive -2.301030 1.041393 1.000000 2.896058 -1.290141
4 Escherichia coli 100.000 0.40 0.100 negative 2.000000 -0.397940 -1.000000 -1.971735 -0.285049
5 Klebsiella pneumoniae 850.000 1.20 1.000 negative 2.929419 0.079181 0.000000 -2.517863 -1.618372
6 Mycobacterium tuberculosis 800.000 5.00 2.000 negative 2.903090 0.698970 0.301030 -2.261874 -2.198915
7 Proteus vulgaris 3.000 0.10 0.100 negative 0.477121 -1.000000 -1.000000 -0.822937 0.609289
8 Pseudomonas aeruginosa 850.000 2.00 0.400 negative 2.929419 0.301030 -0.397940 -2.635522 -1.310557
9 Salmonella (Eberthella) typhosa 1.000 0.40 0.008 negative 0.000000 -0.397940 -2.096910 -0.311321 1.272415
10 Salmonella schottmuelleri 10.000 0.80 0.090 negative 1.000000 -0.096910 -1.045757 -0.934617 -0.154538
11 Staphylococcus albus 0.007 0.10 0.001 positive -2.154902 -1.000000 -3.000000 1.344240 2.937666
12 Staphylococcus aureus 0.030 0.03 0.001 positive -1.522879 -1.522879 -3.000000 0.656089 3.015881
13 Streptococcus fecalis 1.000 1.00 0.100 positive 0.000000 0.000000 -1.000000 0.046724 0.070802
14 Streptococcus hemolyticus 0.001 14.00 10.000 positive -3.000000 1.146128 1.000000 3.583294 -1.127047
15 Streptococcus viridans 0.005 10.00 40.000 positive -2.301030 1.000000 1.602060 3.035960 -1.816007
In [18]:
# Short display labels keyed by the full name found in the "Bacteria" column.
bacteriaAbbreviations = {
    "Aerobacter aerogenes": "Aerobacter",
    "Brucella abortus": "Brucella ABO",
    "Brucella anthracis": "Brucella ANT",
    "Diplococcus pneumoniae": "Diplococcus",
    "Escherichia coli": "Escherichia",
    "Klebsiella pneumoniae": "Klebsiella",
    "Mycobacterium tuberculosis": "Mycobacterium",
    "Proteus vulgaris": "Proteus",
    "Pseudomonas aeruginosa": "Pseudomonas",
    "Salmonella (Eberthella) typhosa": "Sal. typhosa",
    "Salmonella schottmuelleri": "Sal. scho.",
    "Staphylococcus albus": "S. albus",
    "Staphylococcus aureus": "S. aureus",
    "Streptococcus fecalis": "S. fecalis",
    "Streptococcus hemolyticus": "S. hemolyticus",
    "Streptococcus viridans": "S. viridans",
}
In [19]:
# Translate each full bacterium name into its short label. Names missing from
# the mapping come back as NaN.
df["bacteriaAbbr"] = df["Bacteria"].map(bacteriaAbbreviations)
In [20]:
df.head()
Out[20]:
Bacteria Penicilin Streptomycin Neomycin Gram PenicilinL StreptomycinL NeomycinL MDS_x MDS_y bacteriaAbbr
0 Aerobacter aerogenes 870.000 1.00 1.600 negative 2.939519 0.000000 0.204120 -2.501401 -1.771387 Aerobacter
1 Brucella abortus 1.000 2.00 0.020 negative 0.000000 0.301030 -1.698970 0.017111 0.577768 Brucella ABO
2 Brucella anthracis 0.001 0.01 0.007 positive -3.000000 -2.000000 -2.154902 2.377794 3.088192 Brucella ANT
3 Diplococcus pneumoniae 0.005 11.00 10.000 positive -2.301030 1.041393 1.000000 2.896058 -1.290141 Diplococcus
4 Escherichia coli 100.000 0.40 0.100 negative 2.000000 -0.397940 -1.000000 -1.971735 -0.285049 Escherichia
In [21]:
# Scatter plot of the 2-D MDS coordinates, each point labelled with its
# abbreviated bacterium name.
fig = px.scatter(
    data_frame=df,
    x="MDS_x",
    y="MDS_y",
    text="bacteriaAbbr",
    width=800,
    height=800,
)
fig
In [22]:
# For every row, store the name of the antibiotic column holding the smallest
# value. On ties idxmin keeps the first column in the listed order.
df["BestAntibiotic"] = df.loc[:, ["Penicilin", "Streptomycin", "Neomycin"]].idxmin(axis="columns")
In [23]:
df
Out[23]:
Bacteria Penicilin Streptomycin Neomycin Gram PenicilinL StreptomycinL NeomycinL MDS_x MDS_y bacteriaAbbr BestAntibiotic
0 Aerobacter aerogenes 870.000 1.00 1.600 negative 2.939519 0.000000 0.204120 -2.501401 -1.771387 Aerobacter Streptomycin
1 Brucella abortus 1.000 2.00 0.020 negative 0.000000 0.301030 -1.698970 0.017111 0.577768 Brucella ABO Neomycin
2 Brucella anthracis 0.001 0.01 0.007 positive -3.000000 -2.000000 -2.154902 2.377794 3.088192 Brucella ANT Penicilin
3 Diplococcus pneumoniae 0.005 11.00 10.000 positive -2.301030 1.041393 1.000000 2.896058 -1.290141 Diplococcus Penicilin
4 Escherichia coli 100.000 0.40 0.100 negative 2.000000 -0.397940 -1.000000 -1.971735 -0.285049 Escherichia Neomycin
5 Klebsiella pneumoniae 850.000 1.20 1.000 negative 2.929419 0.079181 0.000000 -2.517863 -1.618372 Klebsiella Neomycin
6 Mycobacterium tuberculosis 800.000 5.00 2.000 negative 2.903090 0.698970 0.301030 -2.261874 -2.198915 Mycobacterium Neomycin
7 Proteus vulgaris 3.000 0.10 0.100 negative 0.477121 -1.000000 -1.000000 -0.822937 0.609289 Proteus Streptomycin
8 Pseudomonas aeruginosa 850.000 2.00 0.400 negative 2.929419 0.301030 -0.397940 -2.635522 -1.310557 Pseudomonas Neomycin
9 Salmonella (Eberthella) typhosa 1.000 0.40 0.008 negative 0.000000 -0.397940 -2.096910 -0.311321 1.272415 Sal. typhosa Neomycin
10 Salmonella schottmuelleri 10.000 0.80 0.090 negative 1.000000 -0.096910 -1.045757 -0.934617 -0.154538 Sal. scho. Neomycin
11 Staphylococcus albus 0.007 0.10 0.001 positive -2.154902 -1.000000 -3.000000 1.344240 2.937666 S. albus Neomycin
12 Staphylococcus aureus 0.030 0.03 0.001 positive -1.522879 -1.522879 -3.000000 0.656089 3.015881 S. aureus Neomycin
13 Streptococcus fecalis 1.000 1.00 0.100 positive 0.000000 0.000000 -1.000000 0.046724 0.070802 S. fecalis Neomycin
14 Streptococcus hemolyticus 0.001 14.00 10.000 positive -3.000000 1.146128 1.000000 3.583294 -1.127047 S. hemolyticus Penicilin
15 Streptococcus viridans 0.005 10.00 40.000 positive -2.301030 1.000000 1.602060 3.035960 -1.816007 S. viridans Penicilin
In [24]:
# One-letter code for the best antibiotic: the first character of its name.
# The vectorised .str accessor replaces a per-element Python lambda and leaves
# a missing value as NaN instead of raising TypeError on it.
df["BestAntibioticAbbr"] = df["BestAntibiotic"].str[0]
In [25]:
df.head()
Out[25]:
Bacteria Penicilin Streptomycin Neomycin Gram PenicilinL StreptomycinL NeomycinL MDS_x MDS_y bacteriaAbbr BestAntibiotic BestAntibioticAbbr
0 Aerobacter aerogenes 870.000 1.00 1.600 negative 2.939519 0.000000 0.204120 -2.501401 -1.771387 Aerobacter Streptomycin S
1 Brucella abortus 1.000 2.00 0.020 negative 0.000000 0.301030 -1.698970 0.017111 0.577768 Brucella ABO Neomycin N
2 Brucella anthracis 0.001 0.01 0.007 positive -3.000000 -2.000000 -2.154902 2.377794 3.088192 Brucella ANT Penicilin P
3 Diplococcus pneumoniae 0.005 11.00 10.000 positive -2.301030 1.041393 1.000000 2.896058 -1.290141 Diplococcus Penicilin P
4 Escherichia coli 100.000 0.40 0.100 negative 2.000000 -0.397940 -1.000000 -1.971735 -0.285049 Escherichia Neomycin N
In [26]:
# MDS scatter coloured by best antibiotic; px.scatter builds one trace per
# colour group, each drawing markers together with their text labels.
fig = px.scatter(
    df,
    x="MDS_x",
    y="MDS_y",
    color="BestAntibiotic",
    text="bacteriaAbbr",
    width=800,
    height=800,
)


def add_trace_copy(trace):
    """Split one trace into a marker-only trace plus a text-only twin.

    The trace is added to ``fig`` a second time and the newly appended entry
    (the last item of ``fig.data``) becomes the label layer: text only, placed
    above the point, coloured like the markers and hidden from the legend.
    The trace passed in is then reduced to markers only.
    """
    fig.add_traces(trace)
    label_trace = fig.data[-1]
    label_trace.update(
        textfont_color=trace.marker.color,
        textposition='top center',
        mode="text",
        showlegend=False,
    )
    trace.update(mode="markers")


fig.for_each_trace(add_trace_copy)

# Fixed x range; "domain" shrinks the axis domain rather than the range when
# the aspect-ratio constraint below has to be honoured.
fig.update_xaxes(range=[-4, 4], constrain="domain")

# Tie the y scale to x at 1:1 so distances in the embedding are not distorted.
fig.update_yaxes(scaleanchor="x", scaleratio=1)

fig.show()